Step 2: Summarize Data Broadly
Step 2.1: Sample metadata summary
# Pull the sample metadata.
metadata <- Extract_Sample_Meta(obj,
sample_col = "Sample_Name",
variables_include = c("Diagnosis",
"Sample_Source",
"Sample_Type",
"Tobacco",
"Status",
"Gender",
"Age",
"Ethnicity"))
# Quickly clean up variables.
unique(metadata$Tobacco)
## [1] "Y" "N"
## [3] "Former smoker" NA
## [5] "Former Smoker" "Tobacco use disorder/ in remission"
## [7] "Quit 1997" "Quit 30 years ago"
## [9] "Yes"
metadata$Tobacco <- gsub("\\bY\\b", "Yes", metadata$Tobacco)
metadata$Tobacco <- gsub("\\bN\\b", "No", metadata$Tobacco)
metadata$Tobacco <- gsub("Smoker", "smoker", metadata$Tobacco)
metadata$Tobacco <- gsub("Quit 30 years ago", "Former smoker", metadata$Tobacco)
metadata$Tobacco <- gsub("Quit 1997", "Former smoker", metadata$Tobacco)
# Define columns to plot.
columns_incl <- colnames(metadata)
columns_incl <- columns_incl[columns_incl != "Sample_Name"]
# Loop through columns to plot and visualize.
for (col in columns_incl) {
p <- ggplot(metadata, aes_string(x = col)) +
geom_bar(fill = "darkslategrey", colour = "black") +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = paste("Distribution of", col, "in GSE227136"),
x = col,
y = "Count") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.








# Remove unneeded objects.
rm(col, columns_incl, p, metadata)
Step 3: ANTXR1 Expression
Step 3.1: Where is it expressed?
# ANTXR1 UMAP expression.
Idents(obj) <- "cell_type"
FeaturePlot(obj,
reduction = "umap",
features = "ANTXR1",
split.by = "Diagnosis",
pt.size = 0.8,
label = TRUE,
label.size = 4,
repel = TRUE,
order = TRUE,
raster = FALSE,
cols = c("#d3d3d3", "#edc9af", "#6e260e")) +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8),
strip.text = element_text(size = 15))

# ANTXR1 expression count.
antxr1_summary <- (FetchData(obj, vars = c("Sample_Name", "cell_type", "Diagnosis", "ANTXR1")) %>%
mutate(ANTXR1_pos = ANTXR1 > 0,
ANTXR1_neg = ANTXR1 <= 0)
)
# ANTXR1+ fibroblast by conditions.
antxr1_summary %>%
dplyr::filter(cell_type == "Fibroblasts") %>%
ggplot(aes(x = Diagnosis, fill = ANTXR1_pos)) +
geom_bar(width = 0.5, position = "dodge") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Number of non-zero ANTXR1+ fibroblasts")

# ANTXR1- fibroblast by conditions.
antxr1_summary %>%
dplyr::filter(cell_type == "Fibroblasts") %>%
ggplot(aes(x = Diagnosis, fill = ANTXR1_neg)) +
geom_bar(width = 0.5, position = "dodge") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("Number of non-zero ANTXR1- fibroblasts")

Step 3.2: What are the level of ANTXR1 expression?
# Subset fibroblasts only.
Idents(obj) <- "cell_type"
subset_fibroblast <- subset(obj, idents = c("Fibroblasts"), invert = FALSE)
subset_fibroblast$Diagnosis <- factor(subset_fibroblast$Diagnosis, levels = c("Control", "IPF"))
# Calculate median expression between both conditions.
expression_level <- (FetchData(subset_fibroblast,
vars = c("cell_type", "Diagnosis", "ANTXR1")) %>%
group_by(Diagnosis) %>%
summarise(median_expression = median(ANTXR1),
mean_expression = mean(ANTXR1))
)
print(expression_level)
## # A tibble: 2 × 3
## Diagnosis median_expression mean_expression
## <fct> <dbl> <dbl>
## 1 Control 0 0.253
## 2 IPF 0.693 0.497
# Calculate p-value comparisons. p-adjusted would not make sense here since we are only doing 1 comparison, Control vs. IPF.
expression_stats <- (FetchData(subset_fibroblast,
vars = c("cell_type", "Diagnosis", "ANTXR1")) %>%
wilcox_test(data = ., ANTXR1 ~ Diagnosis) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance("p.adj")
)
print(expression_stats)
## # A tibble: 1 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 ANTXR1 Control IPF 2108 4516 3610559 1.38e-69 1.38e-69 ****
# Plot violin plot comparing the level of expression between both conditions.
source("Z:/Kendrix/IPF_Lung/VlnPlot_stat.R")
VlnPlot_stat(object = subset_fibroblast,
gene_signature = c("ANTXR1"),
test_sign = list(c("Control", "IPF")),
group_name = "Diagnosis",
title = "ANTXR1 pairwise comparisons between control and
IPF fibroblasts",
x_angle = 0,
y_max = 3.5,
hjust = 0.5,
vjust = 1)

Step 4: Differential Expression
Step 4.1: Pseudobulk expression on sample level
# Check if default ident of the Seurat object is cell type.
all(Idents(obj) == obj$cell_type) # True. Default idents are cell types.
## [1] TRUE
# Aggregate counts based on cell_type, condition and accession ID (i.e. mice ID).
pseudo_seu <- AggregateExpression(obj,
assays = "RNA",
group.by = c("cell_type", "Diagnosis", "Sample_Name"),
return.seurat = TRUE)
# Use the new metadata column as the default ident.
head(Cells(pseudo_seu))
## [1] "AT1_Control_THD0001" "AT1_Control_THD0002" "AT1_Control_THD0005"
## [4] "AT1_Control_THD0006" "AT1_Control_THD0007" "AT1_Control_THD0008"
# Create an aggregate variable with cell_type and condition.
pseudo_seu$deg_variable <- paste(pseudo_seu$cell_type, pseudo_seu$Diagnosis, sep = "_")
# Set deg_variable as the default ident.
Idents(pseudo_seu) <- "deg_variable"
# Connect to AnnotationHub.
ah <- AnnotationHub()
# Access the Ensembl database for organism.
ahDb <- query(ah,
pattern = c("Homo sapiens", "EnsDb"),
ignore.case = TRUE)
# Acquire the latest annotation files.
id <- ahDb %>%
mcols() %>%
rownames() %>%
tail(n = 1)
# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb,
return.type = "data.frame")
# Select annotations of interest.
annotations <- annotations %>%
dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)
# Remove unneeded objects.
rm(ah, ahDb, edb, id)
Step 4.3: Explore DEG output
# Explore DEG output.
datatable(
fibroblasts_deg %>%
arrange(p_val_adj, avg_log2FC),
caption = "Significantly expressed genes between Control and IPF Fibroblasts",
options = list(pageLength = 10, scrollX = TRUE),
rownames = FALSE
) %>%
formatRound(columns = c("avg_log2FC"), digits = 3) %>%
formatSignif(columns = c("p_val"), digits = 3) %>%
formatSignif(columns = c("p_val_adj"), digits = 3)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html